Data Exploration

Exploratory Data Analysis

In order to do machine learning with our data, we need to develop a thorough understanding of the data and the relationships between the variables. We will do Exploratory Data Analysis (EDA) to understand the data and to prepare for modeling.

Text Data

To begin, we can explore the textual data gathered that relates to psychosis and cannabis. This text data will be essential for answering questions regarding public sentiment related to the impact of cannabis on psychosis and/or schizophrenia.

Reddit

library(tidyverse, quietly = TRUE, warn.conflicts = FALSE)

wiki_data <- read_csv("../data/clean_data/wiki_cleaned_text.csv")
Rows: 397 Columns: 2
-- Column specification --------------------------------------------------------
Delimiter: ","
chr (2): link, text

i Use `spec()` to retrieve the full column specification for this data.
i Specify the column types or set `show_col_types = FALSE` to quiet this message.
reddit_data <- read_csv(
        "../data/clean_data/reddit_cleaned_text.csv"
    )
Rows: 30000 Columns: 2
-- Column specification --------------------------------------------------------
Delimiter: ","
chr (2): label, text

i Use `spec()` to retrieve the full column specification for this data.
i Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(ggwordcloud)
library(tidytext)
library(wordcloud)
wiki_data <- wiki_data %>%
    mutate(cannabis_label = case_when(
        str_detect(text, "cannabis|marijuana|weed|THC|CBD") ~ "cannabis",
        TRUE ~ "no cannabis"
    )) %>%
    mutate(schiz_label = case_when(
        str_detect(text, "psychosis|psychotic|schizophrenia|schizophrenic|schizo|schizoaffective|schizotypal|schizoid|schiz") ~ "psychosis",
        TRUE ~ "no psychosis"
    )) %>%
    mutate(label = case_when(
        cannabis_label == "cannabis" & schiz_label == "psychosis" ~ "cannabis and psychosis",
        cannabis_label == "cannabis" & schiz_label == "no psychosis" ~ "cannabis",
        cannabis_label == "no cannabis" & schiz_label == "psychosis" ~ "psychosis",
        TRUE ~ "no cannabis and no psychosis"
    ))
tidy_wiki <- wiki_data %>%
    distinct() %>%
    filter(!is.na(text)) %>%
    unnest_tokens(word, text) %>%
    group_by(label, word) %>%
    summarise(n = n()) %>%
    ungroup() %>%
    group_by(word) %>%
    mutate(word_count = sum(n)) %>%
    ungroup() %>%
    arrange(-word_count)
`summarise()` has grouped output by 'label'. You can override using the
`.groups` argument.
tidy_wiki <- tidy_wiki %>%
  anti_join(stop_words) %>%
  filter(word != "numbr")
Joining with `by = join_by(word)`
library(wordcloud)

tidy_wiki %>%
    select(word, word_count) %>%
    distinct() %>%
    with(wordcloud(word, word_count, max.words = 200, colors = brewer.pal(8, "Dark2")))

tidy_wiki %>%
    filter(stringi::stri_enc_isascii(word)) %>%
    group_by(label) %>%
    arrange(-n) %>%
    slice_head(n = 100) %>%
    ggplot(aes(label = word, size = n, color = label)) +
    geom_text_wordcloud_area(rm_outside = TRUE) +
    scale_size_area(max_size = 20) +
    theme_minimal() +
    facet_wrap(~label)

tidy_wiki %>%
    filter(stringi::stri_enc_isascii(word)) %>%
    group_by(label) %>%
    mutate(m = n / sum(n)) %>%
    arrange(-m) %>%
    slice_head(n = 50) %>%
    ggplot(aes(label = word, size = m, color = label)) +
    geom_text_wordcloud_area(rm_outside = TRUE) +
    scale_size_area(max_size = 20) +
    theme_minimal() +
    facet_wrap(~label)

tidy_wiki %>%
    mutate(nchar = nchar(word)) %>%
    group_by(label) %>%
    summarize(mean = mean(nchar))
A tibble: 4 x 2
label mean
<chr> <dbl>
cannabis 7.696249
cannabis and psychosis 7.767887
no cannabis and no psychosis 7.631243
psychosis 8.038816
wiki_data %>%
    distinct() %>%
    filter(!is.na(text)) %>%
    mutate(sentence_length = stringr::str_count(text, "\\S+")) %>%
    group_by(label) %>%
    summarize(mean = mean(sentence_length), median = median(sentence_length), n = n())
A tibble: 4 x 4
label mean median n
<chr> <dbl> <dbl> <int>
cannabis 783.2434 266.5 152
cannabis and psychosis 2391.0417 2006.5 24
no cannabis and no psychosis 388.6061 69.0 198
psychosis 2205.4706 1381.0 17
tidy_wiki %>%
    group_by(label) %>%
    summarise(n = n())
A tibble: 4 x 2
label n
<chr> <int>
cannabis 12902
cannabis and psychosis 7673
no cannabis and no psychosis 10812
psychosis 6183
tidy_wiki %>%
    filter(stringi::stri_enc_isascii(word)) %>%
    group_by(label) %>%
    arrange(-n) %>%
    slice_head(n = 10) %>%
    ggplot(aes(x = n, y = word, fill = label)) +
    geom_bar(stat = "identity") +
    theme_minimal() +
    facet_wrap(~label, scales = "free_y") +
    theme(legend.position = "none")

tidy_wiki %>%
    group_by(label) %>%
    mutate(total = sum(n)) %>%
    ggplot(aes(n/total, fill = label)) +
    geom_histogram(show.legend = FALSE) +
    facet_wrap(~label, ncol = 2, scales = "free_y")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

tidy_wiki %>%
  group_by(label) %>%
  mutate(total = sum(n)) %>%
  arrange(label, -n) %>%
  mutate(rank = row_number(), term_freq = n/total) %>%
  ungroup() %>%
  ggplot(aes(rank, term_freq, color = label)) + 
  geom_line(size = 1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()

tidy_wiki %>%
  group_by(label) %>%
  slice_max(n, n = 15) %>%
  ungroup() %>%
  ggplot(aes(n, fct_reorder(word, n), fill = label)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~label, ncol = 2, scales = "free")

tidy_wiki %>%
    select(word, word_count) %>%
    distinct() %>%
    filter(word_count >= 600) %>%
    ggplot() +
    geom_bar(aes(y = fct_reorder(word, word_count), x = word_count), stat = "identity")

tidy_reddit <- reddit_data %>%
    distinct() %>%
    filter(!is.na(text)) %>%
    unnest_tokens(word, text) %>%
    group_by(label, word) %>%
    summarise(n = n()) %>%
    ungroup() %>%
    group_by(word) %>%
    mutate(word_count = sum(n)) %>%
    ungroup() %>%
    arrange(-word_count)
`summarise()` has grouped output by 'label'. You can override using the
`.groups` argument.
tidy_reddit <- tidy_reddit %>%
  anti_join(stop_words) %>%
  filter(word != "numbr")
Joining with `by = join_by(word)`
tidy_reddit %>%
    select(word, word_count) %>%
    distinct() %>%
    with(wordcloud(word, word_count, max.words = 100, colors = brewer.pal(8, "Dark2")))

ggplot(
  tidy_reddit %>% filter(stringi::stri_enc_isascii(word)) %>% filter(n >= 2),
  aes(
    label = word, size = n, color = label
  )
) +
  geom_text_wordcloud_area(rm_outside = TRUE) +
  scale_size_area(max_size = 20) +
  theme_minimal() +
  facet_wrap(~label)
Warning message in wordcloud_boxes(data_points = points_valid_first, boxes = boxes, :
"Some words could not fit on page. They have been removed."

tidy_reddit %>%
    mutate(nchar = nchar(word)) %>%
    group_by(label) %>%
    summarize(mean = mean(nchar))
A tibble: 3 x 2
label mean
<chr> <dbl>
Psychosis 6.411458
schizophrenia 6.291242
weed 5.555046
reddit_data %>%
    distinct() %>%
    filter(!is.na(text)) %>%
    mutate(sentence_length = stringr::str_count(text, "\\S+")) %>%
    group_by(label) %>%
    summarize(mean = mean(sentence_length))
A tibble: 3 x 2
label mean
<chr> <dbl>
Psychosis 26.979381
schizophrenia 10.802083
weed 8.262626
tidy_reddit %>%
    group_by(label) %>%
    summarise(n = n())
A tibble: 3 x 2
label n
<chr> <int>
Psychosis 960
schizophrenia 491
weed 436
tidy_reddit %>%
    filter(stringi::stri_enc_isascii(word)) %>%
    group_by(label) %>%
    arrange(-n) %>%
    slice_head(n = 10) %>%
    ggplot(aes(x = n, y = word, fill = label)) +
    geom_bar(stat = "identity") +
    theme_minimal() +
    facet_wrap(~label, scales = "free_y")

tidy_reddit %>%
    group_by(label) %>%
    mutate(total = sum(n)) %>%
    ggplot(aes(n/total, fill = label)) +
    geom_histogram(show.legend = FALSE) +
    facet_wrap(~label, ncol = 3, scales = "free_y")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

tidy_reddit %>%
  group_by(label) %>%
  mutate(total = sum(n)) %>%
  arrange(label, -n) %>%
  mutate(rank = row_number(), term_freq = n/total) %>%
  ungroup() %>%
  ggplot(aes(rank, term_freq, color = label)) + 
  geom_line(size = 1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()

tidy_reddit %>%
  group_by(label) %>%
  slice_max(n, n = 15) %>%
  ungroup() %>%
  ggplot(aes(n, fct_reorder(word, n), fill = label)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~label, ncol = 3, scales = "free")

tidy_reddit %>%
    select(word, word_count) %>%
    distinct() %>%
    filter(word_count >= 15) %>%
    ggplot() +
    geom_bar(aes(y = fct_reorder(word, word_count), x = word_count), stat = "identity")

Record Data

s1 <- read_csv("../data/clean_data/s1file_clean.csv")
Rows: 549 Columns: 24
-- Column specification --------------------------------------------------------
Delimiter: ","
dbl (24): Code, Program, sex, age, age_psychosis, famhis, hospita, dui, dup,...

i Use `spec()` to retrieve the full column specification for this data.
i Specify the column types or set `show_col_types = FALSE` to quiet this message.
s2 <- read_csv("../data/clean_data/s2file_clean.csv")
Rows: 477 Columns: 23
-- Column specification --------------------------------------------------------
Delimiter: ","
dbl (23): code, Program, sex, age, agepsychosis, fampsic, hospita, dui, dup,...

i Use `spec()` to retrieve the full column specification for this data.
i Specify the column types or set `show_col_types = FALSE` to quiet this message.
setdiff(names(s1), names(s2))
  1. 'Code'
  2. 'age_psychosis'
  3. 'famhis'
  4. 'levelsocioeco'
  5. 'dasgl0'
  6. 'Diagnosisobinnary'
  7. 'insight'
setdiff(names(s2), names(s1))
  1. 'code'
  2. 'agepsychosis'
  3. 'fampsic'
  4. 'levelecon'
  5. 'cds0'
  6. 'Diagnosisbinario'
full_data <- s1 %>% 
    rename(
        code = Code,
        diagnosis = Diagnosisobinnary,
        family_history = famhis
    ) %>%
    left_join(
        s2 %>%
            rename(
                diagnosis = Diagnosisbinario,
                age_psychosis = agepsychosis,
                family_history = fampsic,
                levelsocioeco = levelecon
            )
    )
Joining with `by = join_by(code, Program, sex, age, age_psychosis,
family_history, hospita, dui, dup, levelsocioeco, urbanarea, livingwithparents,
unmarried, unemployed, years_edu, CannabisBinary, SAPS0, SANS0, Psychoticdim0,
Disorganizeddim0, Negativedimen0, diagnosis)`
full_data %>% head()
A tibble: 6 x 25
code Program sex age age_psychosis family_history hospita dui dup levelsocioeco ... CannabisBinary SAPS0 SANS0 Psychoticdim0 Disorganizeddim0 Negativedimen0 dasgl0 diagnosis insight cds0
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ... <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 22.24658 22.16324 2 2 2.0 1.0 1 ... 1 6 6 4 2 5 2 1 1 NA
2 1 0 32.67945 32.63779 1 2 0.5 0.5 2 ... 1 9 5 8 1 4 0 1 1 NA
3 1 0 30.23562 30.15228 2 1 36.0 1.0 NA ... 1 11 2 4 7 0 NA 1 NA NA
4 1 0 24.80274 24.46941 2 2 18.0 4.0 2 ... 1 14 10 8 6 7 0 0 2 NA
5 1 0 22.38082 21.38082 2 1 28.0 12.0 1 ... 1 11 22 4 7 18 2 0 2 NA
6 1 0 17.07671 16.99338 2 1 1.0 1.0 2 ... 1 10 5 4 6 5 0 1 2 NA
full_data <- full_data %>%
    mutate(
        across(
            c(code, Program, sex, family_history, diagnosis, 
            hospita, levelsocioeco, urbanarea, livingwithparents,
            unmarried, unemployed, CannabisBinary
            ),
            ~ as.factor(.x)
        )
    )
full_data %>%
    ggplot() +
    geom_bar(aes(x = diagnosis, fill = diagnosis)) +
    theme_minimal()

full_data %>%
    ggplot() +
    geom_bar(aes(x = diagnosis, fill = sex))

full_data %>%
    ggplot() +
    geom_bar(aes(x = diagnosis, fill = CannabisBinary))

library(corrplot)
full_data %>%
    select(-cds0) %>%
    select(where(is.numeric)) %>%
    # fill in all missing values with the mean
    mutate(across(where(is.numeric), ~ ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x))) %>%
    # normalize all numeric variables
    mutate(across(where(is.numeric), ~ (.x - min(.x)) / (max(.x) - min(.x)))) %>%
    cor() %>%
    corrplot.mixed(order = 'AOE', upper = 'circle', tl.col = 'black')

full_data %>%
    select(-cds0) %>%
    select(where(is.factor)) %>%
    drop_na() %>%
    mutate(across(where(is.factor), ~ as.numeric(.x))) %>%
    # normalize all numeric variables
    mutate(across(where(is.numeric), ~ (.x - min(.x)) / (max(.x) - min(.x)))) %>%
    cor() %>%
    corrplot.mixed(order = 'AOE', upper = 'circle', tl.col = 'black')

full_data <- full_data %>%
    select(-SANS0, -age) %>%
    mutate(across(where(is.numeric), ~ ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x)))
full_data %>% summary()
      code     Program sex     age_psychosis   family_history hospita   
 1      :  1   1:174   0:311   Min.   :14.81   1   :127       1   :379  
 2      :  1   2: 20   1:238   1st Qu.:21.65   2   :420       2   :169  
 3      :  1   3:203           Median :26.92   NA's:  2       NA's:  1  
 4      :  1   4:152           Mean   :28.92                            
 5      :  1                   3rd Qu.:34.25                            
 6      :  1                   Max.   :59.80                            
 (Other):543                                                            
      dui              dup         levelsocioeco urbanarea  livingwithparents
 Min.   :  0.10   Min.   :  0.06   1   :282      1   :386   1   :273         
 1st Qu.:  2.00   1st Qu.:  1.00   2   :249      2   :154   2   :270         
 Median : 10.00   Median :  3.00   NA's: 18      NA's:  9   NA's:  6         
 Mean   : 21.88   Mean   : 12.51                                             
 3rd Qu.: 24.00   3rd Qu.: 12.00                                             
 Max.   :288.00   Max.   :240.00                                             
                                                                             
 unmarried  unemployed   years_edu     CannabisBinary     SAPS0      
 1   :397   1   :235   Min.   : 6.00   0:236          Min.   : 1.00  
 2   :147   2   :308   1st Qu.: 8.00   1:313          1st Qu.:10.00  
 NA's:  5   NA's:  6   Median :10.00                  Median :14.00  
                       Mean   :10.13                  Mean   :13.76  
                       3rd Qu.:12.00                  3rd Qu.:17.00  
                       Max.   :17.00                  Max.   :25.00  
                                                                     
 Psychoticdim0    Disorganizeddim0 Negativedimen0       dasgl0      diagnosis
 Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   :0.000   0:278    
 1st Qu.: 5.000   1st Qu.: 4.000   1st Qu.: 0.000   1st Qu.:0.000   1:271    
 Median : 7.000   Median : 5.000   Median : 2.000   Median :1.000            
 Mean   : 7.405   Mean   : 6.358   Mean   : 4.785   Mean   :1.412            
 3rd Qu.:10.000   3rd Qu.: 9.000   3rd Qu.: 8.000   3rd Qu.:3.000            
 Max.   :10.000   Max.   :15.000   Max.   :20.000   Max.   :5.000            
                                                                             
    insight           cds0    
 Min.   :1.000   Min.   : NA  
 1st Qu.:1.000   1st Qu.: NA  
 Median :2.000   Median : NA  
 Mean   :1.568   Mean   :NaN  
 3rd Qu.:2.000   3rd Qu.: NA  
 Max.   :2.000   Max.   : NA  
                 NA's   :549  
full_data %>%
    ggplot(aes(x = age_psychosis, y = years_edu, color = CannabisBinary)) +
    geom_jitter(alpha = 0.8) +
    facet_wrap(~CannabisBinary) +
    geom_smooth()
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

full_data %>%names()
  1. 'code'
  2. 'Program'
  3. 'sex'
  4. 'age_psychosis'
  5. 'family_history'
  6. 'hospita'
  7. 'dui'
  8. 'dup'
  9. 'levelsocioeco'
  10. 'urbanarea'
  11. 'livingwithparents'
  12. 'unmarried'
  13. 'unemployed'
  14. 'years_edu'
  15. 'CannabisBinary'
  16. 'SAPS0'
  17. 'Psychoticdim0'
  18. 'Disorganizeddim0'
  19. 'Negativedimen0'
  20. 'dasgl0'
  21. 'diagnosis'
  22. 'insight'
  23. 'cds0'
full_data %>%
    ggplot(aes(x = age_psychosis, y = dui, color = diagnosis)) +
    geom_jitter(alpha = 0.8) +
    facet_wrap(~diagnosis) +
    geom_smooth()
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

full_data %>%
    ggplot(aes(x = age_psychosis, y = dui, color = CannabisBinary)) +
    geom_jitter(alpha = 0.8) +
    facet_wrap(~CannabisBinary)

full_data %>%
    ggplot(aes(x = age_psychosis, y = years_edu, color = diagnosis)) +
    geom_jitter(alpha = 0.8) +
    facet_wrap(~diagnosis)
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

full_data %>%
    ggplot() +
    geom_boxplot(aes(x = diagnosis, y = age_psychosis, fill = diagnosis))

full_data %>%
    ggplot() +
    geom_boxplot(aes(x = CannabisBinary, y = age_psychosis, fill = CannabisBinary))

full_data %>%
    ggplot() +
    geom_histogram(aes(x = age_psychosis, fill = diagnosis), bins = 20) +
    facet_wrap(~diagnosis)

full_data %>%
    ggplot() +
    geom_histogram(aes(x = age_psychosis, fill = sex), bins = 20) +
    facet_wrap(~sex)

full_data %>%
    ggplot() +
    geom_histogram(aes(x = age_psychosis, fill = CannabisBinary), bins = 20) +
    facet_wrap(~CannabisBinary)

full_data %>% names()
  1. 'code'
  2. 'Program'
  3. 'sex'
  4. 'age_psychosis'
  5. 'family_history'
  6. 'hospita'
  7. 'dui'
  8. 'dup'
  9. 'levelsocioeco'
  10. 'urbanarea'
  11. 'livingwithparents'
  12. 'unmarried'
  13. 'unemployed'
  14. 'years_edu'
  15. 'CannabisBinary'
  16. 'SAPS0'
  17. 'Psychoticdim0'
  18. 'Disorganizeddim0'
  19. 'Negativedimen0'
  20. 'dasgl0'
  21. 'diagnosis'
  22. 'insight'
  23. 'cds0'
full_data %>%
    ggplot() +
    geom_histogram(aes(x = Disorganizeddim0, fill = CannabisBinary), bins = 20) +
    facet_wrap(~CannabisBinary)

full_data %>%
    ggplot() +
    geom_histogram(aes(x = Negativedimen0, fill = CannabisBinary), bins = 20) +
    facet_wrap(~CannabisBinary)

full_data %>%
    ggplot() +
    geom_bar(aes(x = levelsocioeco, fill = CannabisBinary), bins = 20) +
    facet_wrap(~CannabisBinary)
Warning message in geom_bar(aes(x = levelsocioeco, fill = CannabisBinary), bins = 20):
"Ignoring unknown parameters: `bins`"

full_data %>%
    ggplot() +
    geom_histogram(aes(x = dasgl0, fill = CannabisBinary), bins = 20) +
    facet_wrap(~CannabisBinary)

full_data %>%
    group_by(CannabisBinary, diagnosis) %>%
    summarize(n = n()) %>%
    ungroup() %>%
    group_by(CannabisBinary) %>%
    mutate(diagnosis_gvn_cannabis = n / sum(n)) %>%
    ungroup() %>%
    group_by(diagnosis) %>%
    mutate(cannabis_gvn_diagnosis = n / sum(n)) %>%
    ungroup()
`summarise()` has grouped output by 'CannabisBinary'. You can override using
the `.groups` argument.
A tibble: 4 x 5
CannabisBinary diagnosis n diagnosis_gvn_cannabis cannabis_gvn_diagnosis
<fct> <fct> <int> <dbl> <dbl>
0 0 119 0.5042373 0.4280576
0 1 117 0.4957627 0.4317343
1 0 159 0.5079872 0.5719424
1 1 154 0.4920128 0.5682657
full_data %>% write_csv("../data/clean_data/full_data.csv")